From d34a9c45420d4069e2a185a3ef6c79bac2a73294 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 20 Mar 2024 18:35:57 +0200 Subject: [PATCH 1/2] Fix barrier --- examples/stable-diffusion/training/train_text_to_image_sdxl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py index 881669414b..21c81f6b2c 100644 --- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py +++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py @@ -799,7 +799,8 @@ def main(args): for idx, dt in enumerate(dataset['train']): dt['image'].save(f'{args.mediapipe}/{idx}.jpg') f.write(dt['text'] + '\n') - torch.distributed.barrier() + if torch.distributed.is_initialized(): + torch.distributed.barrier() from media_pipe_imgdir import get_dataset_for_pipeline dt = get_dataset_for_pipeline(args.mediapipe) dataset = {'train': dt} From f49a06e8fd08542feb4d430fc45775f8ce72825c Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Wed, 20 Mar 2024 18:43:19 +0200 Subject: [PATCH 2/2] align check to OH --- examples/stable-diffusion/training/train_text_to_image_sdxl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py index 21c81f6b2c..6035e3cf47 100644 --- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py +++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py @@ -799,7 +799,7 @@ def main(args): for idx, dt in enumerate(dataset['train']): dt['image'].save(f'{args.mediapipe}/{idx}.jpg') f.write(dt['text'] + '\n') - if torch.distributed.is_initialized(): + if accelerator.distributed_type != GaudiDistributedType.NO: torch.distributed.barrier() from media_pipe_imgdir import get_dataset_for_pipeline dt = get_dataset_for_pipeline(args.mediapipe)