From 3d0e302fb7b60868270ccc860efe016a839c5ea8 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Thu, 14 Mar 2024 18:24:48 +0000 Subject: [PATCH 1/2] stable-diffusion examples: add imagesize requirement and pass --adjust_throughput in SDXL training scripts --- examples/stable-diffusion/requirements.txt | 1 + examples/stable-diffusion/training/run_1x.sh | 3 ++- examples/stable-diffusion/training/run_8x.sh | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/stable-diffusion/requirements.txt b/examples/stable-diffusion/requirements.txt index 0dd006bbc3..272932f9b8 100644 --- a/examples/stable-diffusion/requirements.txt +++ b/examples/stable-diffusion/requirements.txt @@ -1 +1,2 @@ opencv-python +imagesize \ No newline at end of file diff --git a/examples/stable-diffusion/training/run_1x.sh b/examples/stable-diffusion/training/run_1x.sh index 2a8a7dad66..0c87c98503 100755 --- a/examples/stable-diffusion/training/run_1x.sh +++ b/examples/stable-diffusion/training/run_1x.sh @@ -23,4 +23,5 @@ python train_text_to_image_sdxl.py \ --validation_prompt="a robotic cat with wings" \ --validation_epochs 48 \ --checkpointing_steps 2500 \ - --logging_step 10 --discount_chkpoint_saving_in_throughput 2>&1 | tee log_1x_r512.txt + --logging_step 10 \ + --adjust_throughput 2>&1 | tee log_1x_r512.txt diff --git a/examples/stable-diffusion/training/run_8x.sh b/examples/stable-diffusion/training/run_8x.sh index c14e95c3ca..cd38543ebf 100755 --- a/examples/stable-diffusion/training/run_8x.sh +++ b/examples/stable-diffusion/training/run_8x.sh @@ -25,4 +25,4 @@ python ../../gaudi_spawn.py --world_size 8 --use_mpi train_text_to_image_sdxl.py --validation_epochs 48 \ --checkpointing_steps 336 \ --mediapipe dataset_sdxl_pokemon \ - --discount_chkpoint_saving_in_throughput 2>&1 | tee log_8x_r512.txt + --adjust_throughput 2>&1 | tee log_8x_r512.txt From b86e0d67b5a8f3a05a246516f97a56d9985fdd42 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Thu, 14 Mar 2024 18:51:40 +0000 Subject: [PATCH 2/2] train_text_to_image_sdxl: rename --discount_chkpoint_saving_in_throughput to --adjust_throughput --- 
.../stable-diffusion/training/train_text_to_image_sdxl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py index 53b783b311..881669414b 100644 --- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py +++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py @@ -525,10 +525,10 @@ def parse_args(input_args=None): case 3: a non empty path is passed -> images from that location are used ", ) parser.add_argument( - "--discount_chkpoint_saving_in_throughput", + "--adjust_throughput", default=False, action="store_true", - help="Checkpoitn saving takes a lot of time. Ignore time for checkpoint saving for throughput calculations" + help="Checkpoint saving takes a lot of time. Ignore time for checkpoint saving for throughput calculations" ) @@ -1340,7 +1340,7 @@ def compute_time_ids(original_size, crops_coords_top_left): del pipeline - duration = time.perf_counter() - t0 - (checkpoint_time if args.discount_chkpoint_saving_in_throughput else 0) + duration = time.perf_counter() - t0 - (checkpoint_time if args.adjust_throughput else 0) ttt = time.perf_counter() - t_start throughput = (args.max_train_steps - args.throughput_warmup_steps) * total_batch_size / duration