diff --git a/README.md b/README.md index 7776f4cc5a..28dd124121 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,22 @@ outputs = generator( ``` +## Important Note on PyTorch 2.5 Performance Degradation + +With the upgrade to PyTorch 2.5, users may experience some performance degradation due to changes in the handling of FP16/BF16 inputs. The note from PyTorch 2.5 states: + +"A naive SDPA math backend, when using FP16/BF16 inputs, can accumulate significant numerical errors due to the usage of low-precision intermediate buffers. To mitigate this issue, the default behavior now involves upcasting FP16/BF16 inputs to FP32. Computations are performed in FP32/TF32, and the final FP32 results are then downcasted back to FP16/BF16. This will improve numerical accuracy of the final output for the math backend with FP16/BF16 inputs, but increases memory usages and may cause the performance regressions in the math backend as computations shift from FP16/BF16 BMM to FP32/TF32 BMM/Matmul." + +For scenarios where reduced-precision reductions are preferred for speed, they can be enabled with the following setting: +```python +torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) +``` +Additionally, the next release of Optimum Habana will include a Gaudi-specific `safe_softmax` implementation that will also improve performance. + +More info: +- https://pytorch.org/docs/stable/notes/numerical_accuracy.html + + ### Documentation Check out [the documentation of Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/habana/index) for more advanced usage. 
diff --git a/examples/audio-classification/README.md b/examples/audio-classification/README.md index aaa45425cc..dafced7a58 100644 --- a/examples/audio-classification/README.md +++ b/examples/audio-classification/README.md @@ -56,6 +56,7 @@ python run_audio_classification.py \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ + --sdp_on_bf16 \ --bf16 \ --trust_remote_code True ``` @@ -93,6 +94,7 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ --use_lazy_mode False\ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ + --sdp_on_bf16 \ --bf16 \ --trust_remote_code True \ --torch_compile \ @@ -173,6 +175,7 @@ python run_audio_classification.py \ --use_lazy_mode \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/wav2vec2 \ + --sdp_on_bf16 \ --bf16 \ --trust_remote_code True\ --torch_compile \ diff --git a/examples/image-classification/README.md b/examples/image-classification/README.md index 08c4d67123..01b19b25ba 100644 --- a/examples/image-classification/README.md +++ b/examples/image-classification/README.md @@ -57,6 +57,7 @@ PT_HPU_LAZY_MODE=0 python run_image_classification.py \ --gaudi_config_name Habana/vit \ --throughput_warmup_steps 6 \ --dataloader_num_workers 1 \ + --sdp_on_bf16 \ --bf16 ``` @@ -107,6 +108,7 @@ PT_HPU_LAZY_MODE=0 python run_image_classification.py \ --gaudi_config_name Habana/vit \ --throughput_warmup_steps 3 \ --dataloader_num_workers 1 \ + --sdp_on_bf16 \ --bf16 ``` @@ -211,6 +213,7 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ --gaudi_config_name Habana/vit \ --throughput_warmup_steps 8 \ --dataloader_num_workers 1 \ + --sdp_on_bf16 \ --bf16 ``` @@ -298,6 +301,7 @@ python run_image_classification.py \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/vit \ --dataloader_num_workers 1 \ + --sdp_on_bf16 \ --bf16 ``` diff --git a/examples/sentence-transformers-training/paraphrases/training_paraphrases.py 
b/examples/sentence-transformers-training/paraphrases/training_paraphrases.py index d31bfd5796..67cb54f12b 100644 --- a/examples/sentence-transformers-training/paraphrases/training_paraphrases.py +++ b/examples/sentence-transformers-training/paraphrases/training_paraphrases.py @@ -101,6 +101,7 @@ warmup_ratio=0.1, # fp16=True, # Set to False if you get an error that your GPU can't run on FP16 # bf16=False, # Set to True if you have a GPU that supports BF16 + # sdp_on_bf16=True, # Set to True for better performance (but this setting can affect accuracy) batch_sampler=BatchSamplers.NO_DUPLICATES, # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch # We can use ROUND_ROBIN or PROPORTIONAL - to avoid focusing too much on one dataset, we will # use round robin, which samples the same amount of batches from each dataset, until one dataset is empty diff --git a/examples/stable-diffusion/training/README.md b/examples/stable-diffusion/training/README.md index a10c194066..afa4a0a61f 100644 --- a/examples/stable-diffusion/training/README.md +++ b/examples/stable-diffusion/training/README.md @@ -198,6 +198,7 @@ python train_controlnet.py \ --train_batch_size=4 \ --throughput_warmup_steps=3 \ --use_hpu_graphs \ + --sdp_on_bf16 \ --bf16 \ --trust_remote_code ``` @@ -217,6 +218,7 @@ python ../../gaudi_spawn.py --use_mpi --world_size 8 train_controlnet.py \ --train_batch_size=4 \ --throughput_warmup_steps 3 \ --use_hpu_graphs \ + --sdp_on_bf16 \ --bf16 \ --trust_remote_code ``` @@ -295,6 +297,7 @@ python train_text_to_image_sdxl.py \ --gaudi_config_name Habana/stable-diffusion \ --throughput_warmup_steps 3 \ --dataloader_num_workers 8 \ + --sdp_on_bf16 \ --bf16 \ --use_hpu_graphs_for_training \ --use_hpu_graphs_for_inference \ @@ -330,6 +333,7 @@ python ../../gaudi_spawn.py --world_size 8 --use_mpi train_text_to_image_sdxl.py --gaudi_config_name Habana/stable-diffusion \ --throughput_warmup_steps 3 \ --dataloader_num_workers 8 \ + --sdp_on_bf16 \ --bf16 \ 
--use_hpu_graphs_for_training \ --use_hpu_graphs_for_inference \ @@ -365,6 +369,7 @@ python train_text_to_image_sdxl.py \ --use_hpu_graphs_for_training \ --use_hpu_graphs_for_inference \ --checkpointing_steps 3000 \ + --sdp_on_bf16 \ --bf16 ``` @@ -498,6 +503,7 @@ python ../text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ``` @@ -695,5 +701,6 @@ python ../text_to_image_generation.py \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ --bf16 ```