huggingface · regisss · Dec 10, 2024 · Dec 4, 2024 · Dec 6, 2024
@@ -175,6 +175,22 @@ outputs = generator(
 ```
 
 
+## Important Note on PyTorch 2.5 Performance Degradation
+
+With the upgrade to PyTorch 2.5, users may experience some performance degradation due to changes in the handling of FP16/BF16 inputs. The note from PyTorch 2.5 states:
+
+"A naive SDPA math backend, when using FP16/BF16 inputs, can accumulate significant numerical errors due to the usage of low-precision intermediate buffers. To mitigate this issue, the default behavior now involves upcasting FP16/BF16 inputs to FP32. Computations are performed in FP32/TF32, and the final FP32 results are then downcasted back to FP16/BF16. This will improve numerical accuracy of the final output for the math backend with FP16/BF16 inputs, but increases memory usages and may cause the performance regressions in the math backend as computations shift from FP16/BF16 BMM to FP32/TF32 BMM/Matmul."
+
+For scenarios where reduced-precision reductions are preferred for speed, they can be enabled with the following setting:
+```python
+torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
+```
+Additionally, the next release of Optimum Habana will include a Gaudi-specific `safe_softmax` implementation that will further improve performance.
+
+More info:
+- https://pytorch.org/docs/stable/notes/numerical_accuracy.html
+
+
 ### Documentation
 
 Check out [the documentation of Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/habana/index) for more advanced usage.

@@ -56,6 +56,7 @@ python run_audio_classification.py \
     --use_hpu_graphs_for_inference \
     --gaudi_config_name Habana/wav2vec2 \
     --throughput_warmup_steps 3 \
+    --sdp_on_bf16 \
     --bf16 \
     --trust_remote_code True
 ```
@@ -93,6 +94,7 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
     --use_lazy_mode False\
     --gaudi_config_name Habana/wav2vec2 \
     --throughput_warmup_steps 3 \
+    --sdp_on_bf16 \
     --bf16 \
     --trust_remote_code True \
     --torch_compile \
@@ -173,6 +175,7 @@ python run_audio_classification.py \
     --use_lazy_mode \
     --use_hpu_graphs_for_inference \
     --gaudi_config_name Habana/wav2vec2 \
+    --sdp_on_bf16 \
     --bf16 \
     --trust_remote_code True\
     --torch_compile \

@@ -57,6 +57,7 @@ PT_HPU_LAZY_MODE=0 python run_image_classification.py \
     --gaudi_config_name Habana/vit \
     --throughput_warmup_steps 6 \
     --dataloader_num_workers 1 \
+    --sdp_on_bf16 \
     --bf16
 ```
 
@@ -107,6 +108,7 @@ PT_HPU_LAZY_MODE=0 python run_image_classification.py \
     --gaudi_config_name Habana/vit \
     --throughput_warmup_steps 3 \
     --dataloader_num_workers 1 \
+    --sdp_on_bf16 \
     --bf16
 ```
 
@@ -211,6 +213,7 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
     --gaudi_config_name Habana/vit \
     --throughput_warmup_steps 8 \
     --dataloader_num_workers 1 \
+    --sdp_on_bf16 \
     --bf16
 ```
 
@@ -298,6 +301,7 @@ python run_image_classification.py \
     --use_hpu_graphs_for_inference \
     --gaudi_config_name Habana/vit \
     --dataloader_num_workers 1 \
+    --sdp_on_bf16 \
     --bf16
 ```
 

@@ -101,6 +101,7 @@
     warmup_ratio=0.1,
     # fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
     # bf16=False,  # Set to True if you have a GPU that supports BF16
+    # sdp_on_bf16=True,  # Set to True for better performance (but this setting can affect accuracy)
     batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
     # We can use ROUND_ROBIN or PROPORTIONAL - to avoid focusing too much on one dataset, we will
     # use round robin, which samples the same amount of batches from each dataset, until one dataset is empty

@@ -198,6 +198,7 @@ python train_controlnet.py \
  --train_batch_size=4 \
  --throughput_warmup_steps=3 \
  --use_hpu_graphs \
+ --sdp_on_bf16 \
  --bf16 \
  --trust_remote_code
 ```
@@ -217,6 +218,7 @@ python ../../gaudi_spawn.py --use_mpi --world_size 8 train_controlnet.py \
   --train_batch_size=4 \
   --throughput_warmup_steps 3 \
   --use_hpu_graphs \
+  --sdp_on_bf16 \
   --bf16 \
   --trust_remote_code
 ```
@@ -295,6 +297,7 @@ python train_text_to_image_sdxl.py \
   --gaudi_config_name Habana/stable-diffusion \
   --throughput_warmup_steps 3 \
   --dataloader_num_workers 8 \
+  --sdp_on_bf16 \
   --bf16 \
   --use_hpu_graphs_for_training \
   --use_hpu_graphs_for_inference \
@@ -330,6 +333,7 @@ python ../../gaudi_spawn.py --world_size 8 --use_mpi train_text_to_image_sdxl.py
   --gaudi_config_name Habana/stable-diffusion \
   --throughput_warmup_steps 3 \
   --dataloader_num_workers 8 \
+  --sdp_on_bf16 \
   --bf16 \
   --use_hpu_graphs_for_training \
   --use_hpu_graphs_for_inference \
@@ -365,6 +369,7 @@ python train_text_to_image_sdxl.py \
   --use_hpu_graphs_for_training \
   --use_hpu_graphs_for_inference \
   --checkpointing_steps 3000 \
+  --sdp_on_bf16 \
   --bf16
 ```
 
@@ -498,6 +503,7 @@ python ../text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```
 
@@ -695,5 +701,6 @@ python ../text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```