HabanaAI · yeonsily · Feb 6, 2024 · Feb 3, 2024 · Feb 5, 2024 · Feb 5, 2024
diff --git a/examples/stable-diffusion/plot_loss_curve.py b/examples/stable-diffusion/plot_loss_curve.py
@@ -3,7 +3,8 @@
 import numpy as np
 from scipy.signal import savgol_filter
 
-SAMPLE=50
+def sample(x):
+    return max(1, x//500)  # stride for ~500 plotted points; clamp to 1 because x < 500 would give 0 and `[::0]` raises ValueError
 
 def test():
     def match(x,y):
@@ -77,18 +78,22 @@ def parse(flnm, smooth_fn=lambda x:x, clip_first=100):
             if not filter_fn(ln):
                 continue
             step, step_loss = strip_ln(ln)
+
             if 'step_loss' in ln:
                 if prev_step != int(step):
                     loss.append(last_loss)
                     steps.append(int(prev_step))
                     prev_step = int(step)
+                    #print("\nstep/loss", step, last_loss)
                 else:
                     last_loss = float(step_loss)
             #TODO: parse eval epoch?
         loss.append(last_loss)
         steps.append(int(prev_step))
+    SAMPLE= sample(len(loss))
     loss = (loss[clip_first:])[::SAMPLE]
     steps = (steps[clip_first:])[::SAMPLE]
+
     loss = smooth_fn(loss)
     #epoch=fix(epoch) #TODO uncomment this
     return steps, loss, eval_epoch, eval_samples_per_sec, flnm.split('/')[-1].split('.')[0]

diff --git a/examples/stable-diffusion/run.sh b/examples/stable-diffusion/run.sh
@@ -6,19 +6,11 @@ python train_text_to_image_sdxl.py \
   --center_crop \
   --random_flip \
   --proportion_empty_prompts=0.2 \
-  --train_batch_size 1 \
-  --gradient_accumulation_steps 4 \
-  --gradient_checkpointing \
-  --max_train_steps 1 \
+  --train_batch_size 16\
+  --max_train_steps 10000 \
   --learning_rate 1e-06 \
   --lr_scheduler constant \
   --lr_warmup_steps 0 \
+  --gaudi_config Habana/stable-diffusion \
   --bf16 \
-  --validation_prompt="a cute Sundar Pichai creature" \
-  --validation_epochs 5 \
-  --checkpointing_steps=5000 \
-  --output_dir sdxl-pokemon-model \
-  --gaudi_config_name Habana/stable-diffusion \
-  --throughput_warmup_steps 3 \
-  --use_hpu_graphs \
-  --cache_dir /root/software/data/pytorch/huggingface/sdxl 2>&1 | tee log.txt
+  --cache_dir /root/software/data/pytorch/huggingface/sdxl 
diff --git a/examples/stable-diffusion/run_1x_bs1.sh → examples/stable-diffusion/run_1x_bs16.sh b/examples/stable-diffusion/run_1x_bs1.sh → examples/stable-diffusion/run_1x_bs16.sh
@@ -6,9 +6,8 @@ python train_text_to_image_sdxl.py \
   --center_crop \
   --random_flip \
   --proportion_empty_prompts=0.2 \
-  --train_batch_size 1 \
-  --gradient_accumulation_steps 4 \
-  --max_train_steps 1 \
+  --train_batch_size 16 \
+  --max_train_steps 2500 \
   --learning_rate 1e-05 \
   --max_grad_norm 1 \
   --lr_scheduler constant \
@@ -17,7 +16,8 @@ python train_text_to_image_sdxl.py \
   --gaudi_config_name Habana/stable-diffusion \
   --throughput_warmup_steps 3 \
   --bf16 \
-  --validation_prompt="a cute Sundar Pichai creature" \
-  --validation_epochs 5 \
+  --validation_prompt="a horse running on the beach during sunset" \
+  --validation_epochs 48 \
   --use_hpu_graphs \
-  --cache_dir /root/software/data/pytorch/huggingface/sdxl 2>&1 | tee log.txt
+  --checkpointing_steps 2500 \
+  --cache_dir /root/software/data/pytorch/huggingface/sdxl 2>&1 | tee log_1x_bs16.txt
diff --git a/examples/stable-diffusion/train_text_to_image_sdxl.py b/examples/stable-diffusion/train_text_to_image_sdxl.py
@@ -1025,17 +1025,17 @@ def unwrap_model(model):
 
             with accelerator.accumulate(unet):
                 # Sample noise that we'll add to the latents
-                model_input = batch["model_input"].to(dtype=weight_dtype).to(accelerator.device)
+
+                model_input = batch["model_input"].to(dtype=weight_dtype)
+
                 noise = torch.randn_like(model_input)
                 if args.noise_offset:
                     # https://www.crosslabs.org//blog/diffusion-with-offset-noise
-                    # torch.randn is broken on HPU so we need workaround using CPU here
-                    #rand_device = "cpu" if model_input.device.type == "hpu" else model_input.device
                     rand_device = model_input.device
                     noise += args.noise_offset * torch.randn(
                         (model_input.shape[0], model_input.shape[1], 1, 1), device=rand_device
                     )
-                    noise = noise.to(model_input.device)
+                noise = noise.to(model_input.device)
 
                 bsz = model_input.shape[0]
 
@@ -1056,7 +1056,6 @@ def unwrap_model(model):
                 # Add noise to the model input according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
                 noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
-
                 # time ids
                 def compute_time_ids(original_size, crops_coords_top_left):
                     # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
@@ -1069,7 +1068,6 @@ def compute_time_ids(original_size, crops_coords_top_left):
                 add_time_ids = torch.cat(
                     [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])]
                 )
-
                 # Predict the noise residual
                 unet_added_conditions = {"time_ids": add_time_ids}
                 prompt_embeds = batch["prompt_embeds"].to(accelerator.device)
@@ -1121,14 +1119,14 @@ def compute_time_ids(original_size, crops_coords_top_left):
 
                 # Gather the losses across all processes for logging (if we use distributed training).
                 avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
-                train_loss += avg_loss.item() / args.gradient_accumulation_steps
+                train_loss += avg_loss / args.gradient_accumulation_steps
 
                 # Backpropagate
                 #TODO: check why this cause bufferoverflow issue
-                #with accelerator.autocast():    
+                #with torch.autocast(device_type="hpu", dtype=weight_dtype, enabled=True):
                 accelerator.backward(loss)
                 htcore.mark_step()
-                
+
                 if accelerator.sync_gradients:
                     params_to_clip = unet.parameters()
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
@@ -1177,7 +1175,7 @@ def compute_time_ids(original_size, crops_coords_top_left):
                 break
 
         if accelerator.is_main_process:
-            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+            if args.validation_prompt is not None and (epoch+1) % args.validation_epochs == 0:
                 logger.info(
                     f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
                     f" {args.validation_prompt}."